In [3]:
import nltk
import pandas as pd
NLTK Documentation
In [4]:
sentence = """At eight o'clock on Thursday morning
Arthur didn't feel very good."""
In [5]:
tokens = nltk.word_tokenize(sentence)
In [6]:
tokens
Out[6]:
In [7]:
tagged = nltk.pos_tag(tokens)
In [8]:
tagged
Out[8]:
Look at this list of tags (https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging)
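To look up what any individual tag means, NLTK also ships a help utility (this assumes the tagsets data package is installed):
# Describe a Penn Treebank tag, e.g. NNP = proper noun, singular
nltk.help.upenn_tagset('NNP')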
In [9]:
df = pd.read_csv('allPostText_test.csv')
In [10]:
df.info()
In [11]:
def vec(name):
    """Tokenize a piece of text and return its (word, POS tag) pairs."""
    tokens = nltk.word_tokenize(name)
    tagged = nltk.pos_tag(tokens)
    return tagged
In [12]:
df['Text'].apply(vec).head(10)
Out[12]:
In [13]:
df['Tags'] = df['Text'].apply(vec)
In [14]:
def token(tags):
    """Collect the words tagged NNP (singular proper noun) from a tagged list."""
    mini_list = []
    for elem in tags:
        if elem[1] == 'NNP':
            mini_list.append(elem[0])
    return mini_list
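As a quick sanity check, token should pull out only the NNP-tagged words; a sketch reusing the tagged sample sentence from earlier (the exact result depends on the tagger, but something like ['Thursday', 'Arthur'] is expected):
# Reuse the tagged sample sentence; expect only the proper nouns back
token(tagged)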
In [15]:
df['People list'] = df['Tags'].apply(token)
In [16]:
df['People list'].head(10)
Out[16]:
In [17]:
# This looks promising.
# If you get errors about missing NLTK data, download it all from a terminal:
#     sudo python -m nltk.downloader all
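Downloading everything is heavy; a lighter sketch that fetches only the data packages this notebook actually uses (package names as in NLTK 3.x):
import nltk
nltk.download('punkt')                       # sentence/word tokenizer models
nltk.download('averaged_perceptron_tagger')  # POS tagger
nltk.download('maxent_ne_chunker')           # named-entity chunker
nltk.download('words')                       # word list the NE chunker needs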
In [54]:
# ne_chunk returns a tree whose top-level items are either plain (word, tag)
# tuples or labelled subtrees for named entities; hasattr(..., 'label')
# picks out the entity subtrees.
for sent in nltk.sent_tokenize(df['Text'][1]):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
        if hasattr(chunk, 'label'):
            print(chunk.label() + ',', ' '.join(c[0] for c in chunk))
In [61]:
def peopled(elem):
    """Return (entity label, entity text) pairs found in a piece of text."""
    mini_list = []
    for sent in nltk.sent_tokenize(elem):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                p = chunk.label(), ' '.join(c[0] for c in chunk)
                mini_list.append(p)
    return mini_list
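A quick check of peopled on the sample sentence from the top of the notebook (a sketch; the exact chunks depend on the NE chunker's model):
# Expect something like [('PERSON', 'Arthur')]
peopled(sentence)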
In [63]:
df['people'] = df['Text'].apply(peopled)
In [65]:
lst = list(df['people'])
In [71]:
lst = [x for x in lst if x != []]
In [72]:
flat_list = [item for sublist in lst for item in sublist]
In [74]:
# Keep only the entities labelled PERSON.
name_list = []
for name in flat_list:
    if name[0] == 'PERSON':
        name_list.append(name[1])
In [79]:
pd.DataFrame(name_list)[0].value_counts()
Out[79]:
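The same tally can be done without an intermediate DataFrame; an alternative sketch using collections.Counter:
from collections import Counter
Counter(name_list).most_common(10)  # ten most frequently mentioned names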